In this script, we set up the methods for analyzing the association between cell counts and an outcome of interest. We will use calendar age as an example outcome to see what happens using several methods.
We show the following methods:
We also make some plots of the distribution of variables in our dataset.
Load sample sheet.
setwd("/exports/molepi/RSC_BIOS/Users/tjonkman/cellcounts")
.libPaths("/exports/molepi/RSC_BIOS/Users/tjonkman/Packages/4.3.1")
load("01_sample_sheet.rda")
dim(ss)
## [1] 4058 27
dim(cc)
## [1] 4058 12
# Set the variable of interest. In this case: calendar age.
ss$var <- ss$age
#Print cohort characteristics.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
ss %>%
summarise(
samples = n(),
age.mean = round(mean(age), 1),
age.sd = round(sd(age), 1),
age.range = paste0(min(age), "-", max(age)),
male.female.perc = paste0(round(mean(sex == "male") * 100), "%/", round(mean(sex == "female") * 100), "%")
)
## samples age.mean age.sd age.range male.female.perc
## 1 4058 51.2 16.4 18-87 42%/58%
ss %>%
group_by(study) %>%
summarise(
samples = n(),
age.mean = round(mean(age), 1),
age.sd = round(sd(age), 1),
age.range = paste0(min(age), "-", max(age)),
male.female.perc = paste0(round(mean(sex == "male") * 100), "%/", round(mean(sex == "female") * 100), "%")
)
## # A tibble: 6 × 6
## study samples age.mean age.sd age.range male.female.perc
## <fct> <int> <dbl> <dbl> <chr> <chr>
## 1 CODAM 154 65.6 7 50-79 54%/46%
## 2 LL 811 46.8 13.6 18-81 43%/57%
## 3 LLS 714 58.3 6.8 30-79 48%/52%
## 4 NTR 1399 37.6 13.9 18-79 34%/66%
## 5 PAN 173 62.3 9.6 37-87 61%/39%
## 6 RS 807 67.6 7.1 38-87 43%/57%
Show age distribution.
library(ggplot2)
ggplot(ss, aes(x = age)) +
geom_histogram(fill = "#005580", color = "#ffffff", binwidth = 5, breaks = seq(15, 90, 5)) +
theme_bw() +
scale_x_continuous(breaks = seq(15, 90, 5)) +
scale_y_continuous(breaks = seq(0, 600, 100)) +
theme(panel.grid.minor = element_blank()) +
labs(x = "Age (years)", y = "Frequency")
Show cell count distribution.
distr <- data.frame(
row.names = colnames(ss[,11:22]),
mean = round(apply(ss[,11:22], 2, mean), 1),
sd = round(apply(ss[,11:22], 2, sd), 1)
)
#Prepare data for plotting.
celltypes <- c("Neutrophils", "Eosinophils", "Basophils", "Monocytes", "Naive B cells", "Memory B cells", "Naive CD4 T cells", "Memory CD4 T cells", "Naive CD8 T cells", "Memory CD8 T cells", "Regulatory T cells", "Natural Killer cells")
rownames(distr) <- celltypes
distr$Fraction <- factor(rownames(distr), levels = rownames(distr))
distr$mean.lab <- format(round(distr$mean, 1), 1)
distr
## mean sd Fraction mean.lab
## Neutrophils 56.8 12.1 Neutrophils 56.8
## Eosinophils 1.0 1.3 Eosinophils 1.0
## Basophils 2.8 0.7 Basophils 2.8
## Monocytes 6.2 2.8 Monocytes 6.2
## Naive B cells 2.6 1.8 Naive B cells 2.6
## Memory B cells 2.1 0.9 Memory B cells 2.1
## Naive CD4 T cells 5.0 3.6 Naive CD4 T cells 5.0
## Memory CD4 T cells 3.9 3.1 Memory CD4 T cells 3.9
## Naive CD8 T cells 2.6 2.5 Naive CD8 T cells 2.6
## Memory CD8 T cells 6.1 3.7 Memory CD8 T cells 6.1
## Regulatory T cells 3.4 1.0 Regulatory T cells 3.4
## Natural Killer cells 7.4 3.2 Natural Killer cells 7.4
#Define colors.
ct.palette <- c("#cc3333", "#a32929", "#7a1f1f", "#e68019",
"#5acc56", "#439940",
"#66baff", "#5295cc", "#3d7099", "#294a66",
"#5962b3", "#984ea3"
)
#Make pie chart.
library(shadowtext)
ggplot(distr, aes(x = "", y = mean, fill = Fraction, label = mean.lab)) +
geom_bar(stat = "identity", color = "white") +
geom_shadowtext(aes(label = mean.lab, y = mean, x = 1.35), position = position_stack(vjust = 0.5), color = "black", bg.color = "white", size = 4) +
coord_polar("y", start = 0) +
theme_minimal()+
theme(axis.text = element_blank(), axis.title = element_blank(), panel.grid = element_blank()) +
scale_fill_manual(values = ct.palette)
Plot correlations of cell types with age.
plot.data <- ss
library(reshape2)
plot.melt <- melt(plot.data[,c(1:4, 11:22)], id.vars = c("id", "age", "sex", "study"), variable.name = "Fraction", value.name = "Percentage")
levels(plot.melt$Fraction) <- celltypes
# head(plot.melt)
annots <- data.frame(
Fraction = celltypes,
age.cor = paste0("R: ", as.numeric(round(cor(plot.data$age, plot.data[,11:22]), 2)))
)
annots$Fraction <- factor(annots$Fraction, levels = unique(annots$Fraction))
ggplot(plot.melt, aes(x = age, y = Percentage, color = Fraction)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", color = "#222222") +
facet_wrap(facets = vars(Fraction), scales = "free", nrow = 2, labeller = labeller(Fraction = as_labeller(1:12))) +
theme_bw() +
scale_color_manual(values = ct.palette) +
geom_shadowtext(data = annots, aes(x = 18, y = Inf, label = age.cor, color = Fraction), hjust = 0, vjust = 1.5, size = 4, color = "black", bg.color = "white") +
guides(color= "none") +
labs(x = "Age (years)") +
scale_x_continuous(breaks = c(0, 20, 40, 60, 80))
## `geom_smooth()` using formula = 'y ~ x'
Show an example of 3 people: young, old and a young person who is considered older by the clock.
#Select 3 subjects for highlighting:
#A: 50 years old, horvath DNAmAge of 50 (young).
#B: 60 years old, horvath DNAmAge of 60 (old).
#C: 50 years old, horvath DNAmAge of 60 ("biologically old").
highlight.subjects <- ss[c(1072, 261, 184), c("age", "horvath")]
highlight.subjects$horvath <- round(highlight.subjects$horvath)
# highlight.subjects
#Select 50 random subjects for the example plot, plus the 3 highlighted subjects.
set.seed(1)
ss.subset <- ss[(ss$age > 40 & ss$age < 70),]
ss.subset <- ss[sample(x = rownames(ss.subset), size = 100), c("age", "horvath")]
ss.subset <- rbind(ss.subset, highlight.subjects)
# my.limits <- c(min(ss.subset[,c("age", "horvath")]), max(ss.subset[,c("age", "horvath")]))
my.limits <- c(40, 70)
highlight.subjects$subject.no <- c("A", "B", "C")
#Plot.
ggplot(ss.subset, aes(x = age, y = horvath)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
geom_point(size = 2, color = "#999999", alpha = 1) +
lims(x = my.limits, y = my.limits) +
labs(x = "Calendar age (years)", y = "DNAmAge (years)") +
theme_bw() +
geom_segment(x = 50, xend = 60, y = 60, yend = 60, linetype = "dashed", color = "black") +
geom_segment(x = 50, xend = 50, y = 50, yend = 60, linetype = "dashed", color = "black") +
geom_point(data = highlight.subjects, mapping = aes(x = age, y = horvath), color = c("#009900", "#004499", "#990000"), size = 4) +
geom_point(data = highlight.subjects, mapping = aes(x = age, y = horvath), color = c("#11CC11", "#1166CC", "#CC1111"), size = 2.5) +
geom_shadowtext(data = highlight.subjects, mapping = aes(x = age, y = horvath, label = subject.no), color = c("#11CC11", "#1166CC", "#CC1111"), bg.color = "white", size = 6, hjust = c(1.6, -0.6, 1.6), fontface = "bold")
## Warning: Removed 8 rows containing missing values (`geom_point()`).
Run several tests to investigate the age effect of cell types.
3 methods will be used:
#Make a placeholder column for storing a single cell type in.
ss$ct <- NaN
#Make a dataframe to store results.
res <- data.frame(
row.names = colnames(cc),
cell.type = celltypes,
estimate = NaN,
std.error = NaN,
t.stat = NaN,
p.val = NaN,
expl.var = NaN
)
# res
for(i in 1:ncol(cc)){
#Select 1 cell type.
ct <- colnames(cc)[i]
ss$ct <- ss[,ct]
#Test association with var.
fit <- lm(age ~ ct, data = ss)
#Store results
s <- summary(fit)
res[i, 2:5] <- s$coefficients["ct",]
#Which percentage of the variance of the clock is explained by this cell type?
res$expl.var[i] <- s$r.squared
}
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 2)
res[,4] <- round(res[,4], 1)
res[,6] <- round(res[,6], 3)
#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)
res$cell.type <- factor(res$cell.type, levels = res$cell.type)
res.1ct <- res
res.1ct
## cell.type estimate std.error t.stat p.val expl.var
## Neu Neutrophils 0.05 0.02 2.4 1.568890e-02 0.001
## Eos Eosinophils 0.21 0.20 1.0 2.943382e-01 0.000
## Baso Basophils 1.41 0.38 3.7 2.516023e-04 0.003
## Mono Monocytes 0.79 0.09 8.6 1.270487e-17 0.018
## Bnv Naive B cells -0.75 0.14 -5.2 2.277494e-07 0.007
## Bmem Memory B cells 1.37 0.27 5.0 5.068212e-07 0.006
## CD4Tnv Naive CD4 T cells -1.11 0.07 -15.9 3.857653e-55 0.059
## CD4Tmem Memory CD4 T cells 1.24 0.08 15.3 3.201256e-51 0.054
## CD8Tnv Naive CD8 T cells -3.88 0.08 -46.4 0.000000e+00 0.346
## CD8Tmem Memory CD8 T cells 0.76 0.07 11.0 7.210563e-28 0.029
## Treg Regulatory T cells -2.82 0.25 -11.5 3.832852e-30 0.032
## NK Natural Killer cells 0.57 0.08 7.0 2.994033e-12 0.012
## ci.lower ci.upper
## Neu 0.0108 0.0892
## Eos -0.1820 0.6020
## Baso 0.6652 2.1548
## Mono 0.6136 0.9664
## Bnv -1.0244 -0.4756
## Bmem 0.8408 1.8992
## CD4Tnv -1.2472 -0.9728
## CD4Tmem 1.0832 1.3968
## CD8Tnv -4.0368 -3.7232
## CD8Tmem 0.6228 0.8972
## Treg -3.3100 -2.3300
## NK 0.4132 0.7268
#Make the plot.
plot.theme <- list(
geom_hline(yintercept = 0, linetype = "dashed"),
geom_point(color = "#005580", shape = 19, size = 1.5),
geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.4, color = "#005580"),
labs(x = NULL, y = "Age effect (years per %pt.)"),
theme_bw(),
guides(color = "none"),
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
ggplot(res.1ct, aes(x = cell.type, y = estimate)) +
plot.theme
To illustrate that the cell types are strongly correlated with each other, make a correlation heatmap.
cor.cc <- cor(as.matrix(cc))
rownames(cor.cc) <- celltypes
colnames(cor.cc) <- celltypes
#Convert the correlation matrix to a long format for plotting.
cor.cc <- melt(cor.cc, value.name = "Correlation")
#Create the heatmap.
ggplot(cor.cc, aes(x = Var1, y = Var2, fill = Correlation)) +
geom_tile() +
geom_text(aes(label = format(round(Correlation, 1), 1)), vjust = 0.5, hjust = 0.5, size = 2.5) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0, limit = c(-1, 1)) +
labs(x = NULL, y = NULL) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
To demonstrate that these correlations lead to ambiguous effect sizes, show what happens when you correct the age effect of neutrophils for different cell types.
#Make a placeholder column for storing a single cell type in.
ss$ct <- NaN
#Make a dataframe to store results.
res <- data.frame(
row.names = c("none", colnames(cc[,-1])),
cell.type = c("none", celltypes[2:12]),
estimate = NaN,
std.error = NaN,
t.stat = NaN,
p.val = NaN
)
# res
#Fill in the unadjusted neutrophil age effect.
res[1,2:5] <- res.1ct["Neu", 2:5]
#Fill in the neutrophil age effects, each time adjusted for another cell type.
for(i in 1:ncol(cc[,-1])){
#Select 1 cell type.
ct <- colnames(cc[,-1])[i]
ss$ct <- ss[,ct]
#Test association with var.
fit <- lm(age ~ Neu + ct, data = ss)
#Store results
s <- summary(fit)
res[i+1, 2:5] <- s$coefficients["Neu",]
}
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 2)
res[,4] <- round(res[,4], 1)
#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)
res$cell.type <- factor(res$cell.type, levels = res$cell.type)
res.neu.corr <- res
res.neu.corr
## cell.type estimate std.error t.stat p.val ci.lower
## none none 0.05 0.02 2.4 1.568890e-02 0.0108
## Eos Eosinophils 0.09 0.03 3.6 3.325271e-04 0.0312
## Baso Basophils 0.17 0.03 6.2 6.168720e-10 0.1112
## Mono Monocytes 0.18 0.02 7.7 2.267858e-14 0.1408
## Bnv Naive B cells -0.01 0.02 -0.4 7.154132e-01 -0.0492
## Bmem Memory B cells 0.14 0.02 5.8 7.512775e-09 0.1008
## CD4Tnv Naive CD4 T cells -0.14 0.02 -5.9 4.862618e-09 -0.1792
## CD4Tmem Memory CD4 T cells 0.35 0.02 14.0 1.115648e-43 0.3108
## CD8Tnv Naive CD8 T cells -0.30 0.02 -16.5 4.003626e-59 -0.3392
## CD8Tmem Memory CD8 T cells 0.20 0.02 8.4 5.172977e-17 0.1608
## Treg Regulatory T cells -0.01 0.02 -0.2 8.067200e-01 -0.0492
## NK Natural Killer cells 0.21 0.03 8.1 1.019528e-15 0.1512
## ci.upper
## none 0.0892
## Eos 0.1488
## Baso 0.2288
## Mono 0.2192
## Bnv 0.0292
## Bmem 0.1792
## CD4Tnv -0.1008
## CD4Tmem 0.3892
## CD8Tnv -0.2608
## CD8Tmem 0.2392
## Treg 0.0292
## NK 0.2688
#Add the models.
res.neu.corr$model <- c("Neu", "Neu + Eos", "Neu + Baso", "Neu + Mono", "Neu + Bnv", "Neu + Bmem", "Neu + CD4Tnv", "Neu + CD4Tmem", "Neu + CD8Tnv", "Neu + CD8Tmem", "Neu + Treg", "Neu + NK")
res.neu.corr$model <- factor(res.neu.corr$model, levels = unique(res.neu.corr$model))
#Make the plot.
ggplot(res.neu.corr, aes(x = model, y = estimate)) +
geom_hline(yintercept = res.neu.corr["none", "estimate"], linetype = "dashed", color = "#333333", alpha = 1, linewidth = 0.5) +
geom_point(color = "#005580", shape = 19, size = 1.5) +
geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = 0.4, color = "#005580") +
labs(y = "Neutrophil age effect (years per %pt.)") +
theme_bw() +
guides(color = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())
#Empty dataframe for storing the model intercepts.
ints <- data.frame(
row.names = paste0("No ", celltypes),
ct.left.out = paste0("No ", celltypes),
intercept = rep(NaN, 12)
)
corr.12ct <- function(form, ct.left.out){
#Test association with age.
fit <- lm(form, data = ss)
#Add a dummy entry for the cell type that was set to NA.
na.ct <- data.frame(
row.names = names(fit$coefficients[13]),
estimate = NaN,
std.error = NaN,
t.stat = NaN,
p.val = NaN,
ci.lower = NaN,
ci.upper = NaN
)
#Store results.
s <- summary(fit)
res <- as.data.frame(s$coefficients)
colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")
#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)
res[,1] <- round(res[,1], 2)
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 1)
# res[,4] <- format(res[,4], scientific = T, digits = 2)
res[,5] <- round(res[,5], 2)
res[,6] <- round(res[,6], 2)
#Add the intercept to the dataframe.
ints[ct.left.out, 2] <<- round(res[1,1], 1)
# #Remove the intercept effect, selecting only cell type effects.
# print(res[1,])
res <- res[-(1),]
#Add the cell type that was set to NA in this model.
res <- rbind(res, na.ct)
#Re-order columns.
res <- res[c("Neu", "Eos", "Baso", "Mono", "Bnv", "Bmem", "CD4Tnv", "CD4Tmem", "CD8Tnv", "CD8Tmem", "Treg", "NK"),]
rownames(res) <- celltypes
res$ct <- factor(rownames(res), levels = rownames(res))
res$ct.left.out <- ct.left.out
return(res)
}
res.no.neu <- corr.12ct(formula(age ~ Baso + Eos + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu), "No Neutrophils")
res.no.eos <- corr.12ct(formula(age ~ Baso + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu + Eos), "No Eosinophils")
res.no.baso <- corr.12ct(formula(age ~ NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono + Neu + Eos + Baso), "No Basophils")
res.no.mono <- corr.12ct(formula(age ~ Baso + Eos + Neu + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv + Mono), "No Monocytes")
res.no.bnv <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem + Bnv), "No Naive B cells")
res.no.bmem <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv + Bmem), "No Memory B cells")
res.no.cd4tnv <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem + CD4Tnv), "No Naive CD4 T cells")
res.no.cd4tmem <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + NK + Treg + CD8Tmem + CD8Tnv + CD4Tmem), "No Memory CD4 T cells")
res.no.cd8tnv <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + NK + Treg + CD8Tmem + CD8Tnv), "No Naive CD8 T cells")
res.no.cd8tmem <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + NK + Treg + CD8Tmem), "No Memory CD8 T cells")
res.no.treg <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + NK + Treg), "No Regulatory T cells")
res.no.nk <- corr.12ct(formula(age ~ Baso + Eos + Neu + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK), "No Natural Killer cells")
#Plot the effect sizes of each cell type for each model. This is supposed to illustrate that these fluctuate wildly based on which cell type you leave out.
res.12ct <- rbind(res.no.neu, res.no.eos, res.no.baso, res.no.mono, res.no.bnv, res.no.bmem, res.no.cd4tnv, res.no.cd4tmem, res.no.cd8tnv, res.no.cd8tmem, res.no.treg, res.no.nk)
res.12ct$ct.left.out <- factor(res.12ct$ct.left.out, levels = unique(res.12ct$ct.left.out))
head(res.12ct)
## estimate std.error t.stat p.val ci.lower ci.upper
## Neutrophils NaN NaN NaN NaN NaN NaN
## Eosinophils -0.06 0.18 -0.3 7.430640e-01 -0.40 0.29
## Basophils 2.88 0.40 7.2 5.894134e-13 2.10 3.66
## Monocytes 0.42 0.08 5.5 4.912255e-08 0.27 0.57
## Naive B cells -0.23 0.12 -1.9 6.133143e-02 -0.47 0.01
## Memory B cells 0.49 0.25 2.0 4.873306e-02 0.00 0.97
## ct ct.left.out
## Neutrophils Neutrophils No Neutrophils
## Eosinophils Eosinophils No Neutrophils
## Basophils Basophils No Neutrophils
## Monocytes Monocytes No Neutrophils
## Naive B cells Naive B cells No Neutrophils
## Memory B cells Memory B cells No Neutrophils
#Test the proportion of variance explained by this method, compared to an intercept-only model.
#NOTE: this only has to be done once. The total effect of each 11-celltype model is exactly the same.
fit <- lm(formula = age ~ Neu + Eos + Baso + Mono + Bnv + Bmem + CD4Tnv + CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK, data = ss)
summary(fit)
##
## Call:
## lm(formula = age ~ Neu + Eos + Baso + Mono + Bnv + Bmem + CD4Tnv +
## CD4Tmem + CD8Tnv + CD8Tmem + Treg + NK, data = ss)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.263 -8.510 0.755 8.696 43.848
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.30593 6.84405 8.958 < 2e-16 ***
## Neu -0.10786 0.06922 -1.558 0.1192
## Eos -0.16578 0.19598 -0.846 0.3977
## Baso 2.76971 0.41016 6.753 1.66e-11 ***
## Mono 0.31098 0.11310 2.750 0.0060 **
## Bnv -0.33896 0.14263 -2.377 0.0175 *
## Bmem 0.38086 0.25652 1.485 0.1377
## CD4Tnv -0.42876 0.09307 -4.607 4.21e-06 ***
## CD4Tmem 0.93182 0.10815 8.616 < 2e-16 ***
## CD8Tnv -3.90102 0.11033 -35.359 < 2e-16 ***
## CD8Tmem -0.11937 0.10272 -1.162 0.2453
## Treg -1.12477 0.23974 -4.692 2.80e-06 ***
## NK NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.37 on 4046 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.4335
## F-statistic: 283.2 on 11 and 4046 DF, p-value: < 2.2e-16
var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")
var.ex
## [1] "R²: 43.5%"
ints$intercept <- paste0("Intercept: ", ints$intercept)
ints$ct.left.out <- factor(ints$ct.left.out, levels = ints$ct.left.out)
#Plot the effect sizes for each of the 12 models, each time leaving out a different cell type.
#Print the intercepts at the top left, and the variance explained at the top right.
ggplot(res.12ct, aes(x = ct, y = estimate)) +
plot.theme +
geom_vline(data = subset(res.12ct, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
facet_wrap(facets = vars(ct.left.out), scales = "free") +
geom_text(data = ints, aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 12 rows containing missing values (`geom_point()`).
Print separated versions of the results without naive CD8 T cells or without basophils, as examples.
ex.mod = "No Naive CD8 T cells"
res.ex <- res.12ct[res.12ct$ct.left.out == ex.mod,]
ggplot(res.ex, aes(x = ct, y = estimate)) +
plot.theme +
geom_vline(data = subset(res.ex, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
geom_text(data = ints[rownames(ints) == ex.mod,], aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
ex.mod = "No Basophils"
res.ex <- res.12ct[res.12ct$ct.left.out == ex.mod,]
ggplot(res.ex, aes(x = ct, y = estimate)) +
plot.theme +
geom_vline(data = subset(res.ex, is.na(estimate)), aes(xintercept = ct), color = "red", alpha = 0.5, linetype = "dashed") +
geom_text(data = ints[rownames(ints) == ex.mod,], aes(x = 0.6, y = 8.3, label = intercept), hjust = 0, vjust = 0.6) +
annotate("text", x = 12.4, y = 8.3, label = var.ex, hjust = 1, vjust = 0.6) +
scale_y_continuous(limits = c(-7.5, 8.4), breaks = seq(-10, 10, 2))
## Warning: Removed 1 rows containing missing values (`geom_point()`).
First, calculate PCs.
#Calculate PCs in the cell counts.
set.seed(1)
pca <- prcomp(cc, scale = TRUE)
#Scree plot.
var.ex <- pca$sdev^2 / sum(pca$sdev^2)
scree.dat <- data.frame(PC = 1:length(var.ex), variance = var.ex, round.var = format(round(var.ex, 2)))
scree.dat <- scree.dat[1:11,]
ggplot(scree.dat, aes(x = PC, y = variance, label = round.var)) +
geom_point(size = 2, color = "#005580") +
geom_line(linewidth = 1, color = "#005580") +
geom_text(hjust = -0.1, vjust = -0.4, size = 3.5) +
labs(x = "Principal Component", y = "Proportion of variance explained") +
scale_x_continuous(limits = c(0.5, 11.5), breaks = 1:11) +
theme_bw() +
theme(panel.grid.minor = element_blank())
#Biplot.
biplot(pca, cex = 1, choices = c(1,2))
# Create a data frame with the principal component scores.
pca.df <- as.data.frame(pca$x[,1:12])
# Scatter plot of PC1 vs. PC2 with variance explained labeled.
ggplot(pca.df, aes(x = PC1, y = PC2)) +
geom_point(shape = 1) +
labs(x = paste0("PC1: ", round(var.ex[1]*100, 1), "%"), y = paste0("PC2: ", round(var.ex[2]*100, 1), "%")) +
ggtitle("") +
theme_bw()
Reverse PC sign if necessary to make the interpretation more intuitive.
#Correlations between PCs and cell counts.
corr.pcs <- cor(pca.df, cc)
colnames(corr.pcs) <- celltypes
#If the correlation of this fraction is below 0, inverse the sign of the PC so that it becomes positive.
for(i in 1:12){
j <- which(corr.pcs[i,]^2 == max(corr.pcs[i,]^2))
if(corr.pcs[i,j] == min(corr.pcs[i,])){
pca.df[,i] <- -(pca.df[,i])
pca$rotation[,i] <- -(pca$rotation[,i])
corr.pcs[i,] <- -(corr.pcs[i,])
}
}
#Check if the most explanatory cell type is always positive. Should be 100% TRUE.
for(i in 1:12){
j <- which(corr.pcs[i,]^2 == max(corr.pcs[i,]^2))
print(paste0(rownames(corr.pcs)[i], ": ", corr.pcs[i,j] == max(corr.pcs[i,])))
}
## [1] "PC1: TRUE"
## [1] "PC2: TRUE"
## [1] "PC3: TRUE"
## [1] "PC4: TRUE"
## [1] "PC5: TRUE"
## [1] "PC6: TRUE"
## [1] "PC7: TRUE"
## [1] "PC8: TRUE"
## [1] "PC9: TRUE"
## [1] "PC10: TRUE"
## [1] "PC11: TRUE"
## [1] "PC12: TRUE"
save(pca.df, file = "02_pca_df.rda")
Check how the PCs are correlated with individual cell counts.
plot.data <- cbind(pca.df, cc)
#Scatter plot of principal components with all 12 cell types.
plot.data <- melt(plot.data, id.vars = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10", "PC11", "PC12"), variable.name = "cell.type", value.name = "proportion")
round(cor(pca.df, cc), 1)
## Neu Eos Baso Mono Bnv Bmem CD4Tnv CD4Tmem CD8Tnv CD8Tmem Treg NK
## PC1 1.0 -0.6 -0.7 -0.5 -0.5 -0.6 -0.4 -0.6 -0.3 -0.4 -0.3 -0.5
## PC2 0.0 0.0 0.0 -0.3 0.3 -0.2 0.7 -0.2 0.6 -0.6 0.4 -0.3
## PC3 0.1 0.0 0.3 -0.3 -0.3 0.2 -0.2 -0.5 -0.1 0.4 0.7 0.1
## PC4 0.1 0.0 0.0 -0.5 -0.1 0.6 0.1 0.4 0.0 0.1 -0.1 -0.4
## PC5 -0.1 -0.5 -0.1 -0.3 0.5 -0.1 0.0 0.0 0.1 0.4 -0.1 0.1
## PC6 -0.1 -0.1 -0.3 0.0 -0.4 0.2 0.1 -0.1 0.5 0.1 -0.2 0.3
## PC7 -0.1 -0.3 0.0 -0.1 -0.2 0.0 0.4 0.2 -0.4 -0.2 0.1 0.4
## PC8 0.0 0.5 -0.1 -0.5 0.0 -0.3 0.0 0.1 0.0 0.0 -0.1 0.3
## PC9 -0.1 0.1 -0.1 0.1 -0.1 -0.1 0.4 -0.2 -0.2 0.4 -0.1 -0.3
## PC10 0.0 0.1 -0.3 0.0 0.2 0.4 0.1 -0.3 -0.2 -0.1 0.0 0.1
## PC11 0.1 0.0 0.4 -0.1 0.0 0.1 0.1 -0.2 0.0 -0.1 -0.3 0.1
## PC12 0.7 -0.5 -0.5 -0.3 -0.4 -0.5 -0.2 -0.6 -0.2 -0.4 -0.1 -0.5
library(ggpubr)
plot.theme <- list(
geom_point(shape = 1),
geom_smooth(method = "lm"),
stat_cor(method = "pearson", cor.coef.name = "R", label.x = -Inf, label.y = 105, hjust = 0, vjust = 0.7, size = 3.5, r.digits = 1, label.sep = ", "),
facet_wrap(facets = vars(cell.type)),
theme_bw(),
theme(strip.text = element_text(size = 10)),
scale_y_continuous(limits = c(-10, 105), breaks = seq(-25, 100, 25))
)
#Correlations between PCs and cell counts.
corr.pcs <- corr.pcs[1:11,]
corr.pcs <- melt(corr.pcs, varnames = c("PC", "cell.type"), value.name = "Correlation")
corr.pcs$cell.type <- factor(corr.pcs$cell.type, levels = rev(levels(corr.pcs$cell.type)))
ggplot(corr.pcs, aes(x = PC, y = cell.type, fill = Correlation)) +
geom_tile() +
geom_text(aes(label = format(round(Correlation, 1), digits = 1)), vjust = 0.5, hjust = 0.5, size = 2.5) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",
midpoint = 0, limit = c(-1, 1)) +
labs(x = NULL, y = NULL) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
guides(fill = "none")
Include the PCs into the model, and plot the results.
ss.pc <- cbind(ss, pca.df)
fit <- lm(formula = age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11, data = ss.pc)
summary(fit)
##
## Call:
## lm(formula = age ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 +
## PC8 + PC9 + PC10 + PC11, data = ss.pc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.263 -8.510 0.755 8.696 43.848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.15771 0.19421 263.411 < 2e-16 ***
## PC1 0.02333 0.09792 0.238 0.81169
## PC2 -6.46157 0.14945 -43.237 < 2e-16 ***
## PC3 -0.72825 0.17574 -4.144 3.49e-05 ***
## PC4 0.99617 0.19501 5.108 3.40e-07 ***
## PC5 -1.95336 0.22413 -8.715 < 2e-16 ***
## PC6 -6.28966 0.22680 -27.732 < 2e-16 ***
## PC7 4.35027 0.23650 18.395 < 2e-16 ***
## PC8 -0.44157 0.24144 -1.829 0.06749 .
## PC9 0.83518 0.26370 3.167 0.00155 **
## PC10 0.09467 0.29267 0.323 0.74636
## PC11 0.76573 0.31814 2.407 0.01613 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.37 on 4046 degrees of freedom
## Multiple R-squared: 0.435, Adjusted R-squared: 0.4335
## F-statistic: 283.2 on 11 and 4046 DF, p-value: < 2.2e-16
var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")
res <- as.data.frame(summary(fit)$coefficients)
colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")
#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)
res[,1] <- round(res[,1], 2)
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 1)
res[,5] <- round(res[,5], 2)
res[,6] <- round(res[,6], 2)
# #Remove the intercept and the age effect, selecting only cell type effects.
pc.int <- res[1,1]
res <- res[-(1),]
res$PC <- factor(rownames(res), levels = rownames(res))
res.pcs <- res
res.pcs
## estimate std.error t.stat p.val ci.lower ci.upper PC
## PC1 0.02 0.10 0.2 8.116918e-01 -0.17 0.22 PC1
## PC2 -6.46 0.15 -43.2 0.000000e+00 -6.75 -6.17 PC2
## PC3 -0.73 0.18 -4.1 3.486108e-05 -1.07 -0.38 PC3
## PC4 1.00 0.20 5.1 3.399769e-07 0.61 1.38 PC4
## PC5 -1.95 0.22 -8.7 4.155524e-18 -2.39 -1.51 PC5
## PC6 -6.29 0.23 -27.7 4.057491e-155 -6.73 -5.85 PC6
## PC7 4.35 0.24 18.4 1.229801e-72 3.89 4.81 PC7
## PC8 -0.44 0.24 -1.8 6.749274e-02 -0.91 0.03 PC8
## PC9 0.84 0.26 3.2 1.550561e-03 0.32 1.35 PC9
## PC10 0.09 0.29 0.3 7.463566e-01 -0.48 0.67 PC10
## PC11 0.77 0.32 2.4 1.613492e-02 0.14 1.39 PC11
plot.theme <- list(
geom_hline(yintercept = 0, linetype = "dashed"),
geom_point(color = "#005580", shape = 19, size = 1.5),
geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), color = "#005580", width = 0.4),
geom_vline(data = subset(res.pcs, is.na(estimate)), aes(xintercept = PC), color = "red", alpha = 0.5, linetype = "dashed"),
labs(x = NULL, y = "Age effect"),
theme_bw(),
guides(color = "none"),
scale_y_continuous(limits = c(-8.5, 5.0), breaks = seq(-10, 10, 2)),
theme(plot.margin = margin(t = 5.5, r = 5.5, b = 5.5, l = 22, unit = "pt")),
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
#Manually add the apparent biological meaning to each PC.
pc.labs <- c("PC1: Neutrophils", "PC2: T cell naive/memory", "PC3: Treg/CD4Tmem", "PC4: Bmem/Mono", "PC5: Bnv/Eos", "PC6: Naive CD8 T cells", "PC7: Mixed (CD4Tnv/CD8Tnv)", "PC8: Eos/Mono", "PC9: Mixed (CD4Tnv+CD8Tmem)", "PC10: Mixed (Bmem)", "PC11: Mixed (Baso)")
res.pcs$PC.label <- pc.labs
res.pcs$PC.label <- factor(res.pcs$PC.label, levels = unique(res.pcs$PC.label))
#Intercept and variance explained.
ggplot(res.pcs, aes(x = PC.label, y = estimate)) +
plot.theme +
annotate("text", x = 0.6, y = 5.0, label = paste0("Intercept: ", pc.int), hjust = 0, vjust = 0.6) +
annotate("text", x = 11.4, y = 5.0, label = var.ex, hjust = 1, vjust = 0.6)
Leave out PC2 to demonstrate that this doesn’t affect effect sizes for the remaining PCs (in contrast to the analysis using individual cell counts).
fit <- lm(formula = age ~ PC1 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 + PC11, data = ss.pc)
summary(fit)
##
## Call:
## lm(formula = age ~ PC1 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 +
## PC9 + PC10 + PC11, data = ss.pc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.836 -11.535 1.016 11.511 48.164
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.15771 0.23480 217.875 < 2e-16 ***
## PC1 0.02333 0.11838 0.197 0.843781
## PC3 -0.72825 0.21247 -3.427 0.000615 ***
## PC4 0.99617 0.23577 4.225 2.44e-05 ***
## PC5 -1.95336 0.27097 -7.209 6.70e-13 ***
## PC6 -6.28966 0.27421 -22.938 < 2e-16 ***
## PC7 4.35027 0.28592 15.215 < 2e-16 ***
## PC8 -0.44157 0.29190 -1.513 0.130430
## PC9 0.83518 0.31881 2.620 0.008834 **
## PC10 0.09467 0.35384 0.268 0.789058
## PC11 0.76573 0.38464 1.991 0.046571 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.96 on 4047 degrees of freedom
## Multiple R-squared: 0.174, Adjusted R-squared: 0.1719
## F-statistic: 85.24 on 10 and 4047 DF, p-value: < 2.2e-16
test.var.ex <- paste0("R²: ", round(summary(fit)$r.squared * 100, 1), "%")
res <- as.data.frame(summary(fit)$coefficients)
colnames(res) <- c("estimate", "std.error", "t.stat", "p.val")
#Calculate 95% confidence intervals.
res$ci.lower <- res$estimate - (1.96 * res$std.error)
res$ci.upper <- res$estimate + (1.96 * res$std.error)
res[,1] <- round(res[,1], 2)
res[,2] <- round(res[,2], 2)
res[,3] <- round(res[,3], 1)
res[,5] <- round(res[,5], 2)
res[,6] <- round(res[,6], 2)
# #Remove the intercept and the age effect, selecting only cell type effects.
test.pc.int <- res[1,1]
res <- res[-(1),]
#Add a dummy PC2.
res <- rbind(res[1,], c(NaN, NaN, NaN, NaN, NaN, NaN), res[-1,])
rownames(res)[2] <- "PC2"
res$PC <- factor(rownames(res), levels = rownames(res))
res.test <- res
res.test
## estimate std.error t.stat p.val ci.lower ci.upper PC
## PC1 0.02 0.12 0.2 8.437812e-01 -0.21 0.26 PC1
## PC2 NaN NaN NaN NaN NaN NaN PC2
## PC3 -0.73 0.21 -3.4 6.153566e-04 -1.14 -0.31 PC3
## PC4 1.00 0.24 4.2 2.439152e-05 0.53 1.46 PC4
## PC5 -1.95 0.27 -7.2 6.700323e-13 -2.48 -1.42 PC5
## PC6 -6.29 0.27 -22.9 1.435043e-109 -6.83 -5.75 PC6
## PC7 4.35 0.29 15.2 7.040991e-51 3.79 4.91 PC7
## PC8 -0.44 0.29 -1.5 1.304304e-01 -1.01 0.13 PC8
## PC9 0.84 0.32 2.6 8.833748e-03 0.21 1.46 PC9
## PC10 0.09 0.35 0.3 7.890583e-01 -0.60 0.79 PC10
## PC11 0.77 0.38 2.0 4.657150e-02 0.01 1.52 PC11
#Manually add the apparent biological meaning to each PC.
res.test$PC.label <- pc.labs
res.test$PC.label <- factor(res.test$PC.label, levels = unique(res.test$PC.label))
#Intercept and variance explained.
ggplot(res.test, aes(x = PC.label, y = estimate)) +
plot.theme +
annotate("text", x = 0.6, y = 5.0, label = paste0("Intercept: ", pc.int), hjust = 0, vjust = 0.6) +
annotate("text", x = 11.4, y = 5.0, label = test.var.ex, hjust = 1, vjust = 0.6) +
geom_vline(data = subset(res.test, is.na(estimate)), aes(xintercept = PC.label), color = "red", alpha = 0.5, linetype = "dashed")
## Warning: Removed 1 rows containing missing values (`geom_point()`).
sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Rocky Linux 8.10 (Green Obsidian)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libopenblas-r0.3.15.so; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Amsterdam
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggpubr_0.6.0 reshape2_1.4.4 shadowtext_0.1.3 ggplot2_3.4.3
## [5] dplyr_1.1.2
##
## loaded via a namespace (and not attached):
## [1] tidyr_1.3.0 sass_0.4.7 utf8_1.2.3 generics_0.1.3
## [5] rstatix_0.7.2 stringi_1.7.12 lattice_0.21-8 digest_0.6.33
## [9] magrittr_2.0.3 evaluate_0.21 grid_4.3.1 fastmap_1.1.1
## [13] plyr_1.8.8 jsonlite_1.8.7 Matrix_1.6-1 backports_1.4.1
## [17] mgcv_1.8-42 purrr_1.0.2 fansi_1.0.4 scales_1.2.1
## [21] jquerylib_0.1.4 abind_1.4-5 cli_3.6.1 rlang_1.1.1
## [25] munsell_0.5.0 splines_4.3.1 withr_2.5.0 cachem_1.0.8
## [29] yaml_2.3.7 tools_4.3.1 ggsignif_0.6.4 colorspace_2.1-0
## [33] broom_1.0.5 vctrs_0.6.3 R6_2.5.1 lifecycle_1.0.3
## [37] stringr_1.5.0 car_3.1-2 pkgconfig_2.0.3 pillar_1.9.0
## [41] bslib_0.5.1 gtable_0.3.4 glue_1.6.2 Rcpp_1.0.11
## [45] xfun_0.40 tibble_3.2.1 tidyselect_1.2.0 highr_0.10
## [49] rstudioapi_0.15.0 knitr_1.43 farver_2.1.1 htmltools_0.5.6
## [53] nlme_3.1-162 carData_3.0-5 rmarkdown_2.24 labeling_0.4.2
## [57] compiler_4.3.1